import pandas as pd
import numpy as np
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline
plt.style.use(['bmh']) # bmh Styling used for Visulization
df = pd.read_csv("MOCK_DATA csv.csv")
print ('Data read into a pandas dataframe!')
Data read into a pandas dataframe!
df.head() #Calling first 5 rows
id | first_name | last_name | gender | ip_address | Car | Colour | company | ||
---|---|---|---|---|---|---|---|---|---|
0 | 1 | Miguelita | Ticic | mticic0@ted.com | Female | 45.216.118.239 | Honda | Pink | Hagenes, Hayes and Roob |
1 | 2 | Cami | Sibery | csibery1@nationalgeographic.com | Female | 193.140.90.251 | Mazda | Crimson | Huels LLC |
2 | 3 | Melany | Iskower | miskower2@latimes.com | Female | 146.18.2.124 | Hyundai | Teal | Hessel-Lebsack |
3 | 4 | Eb | Shurman | eshurman3@europa.eu | Male | 37.3.25.144 | Land Rover | Purple | Walker-Bogisich |
4 | 5 | Emery | Springett | espringett4@livejournal.com | Male | 15.31.162.5 | Pontiac | Aquamarine | Nader, Lynch and Wiza |
df.dtypes #Checking Datatypes
id int64 first_name object last_name object email object gender object ip_address object Car object Colour object company object dtype: object
df.info(verbose=False) #Information about the dataframe
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Columns: 9 entries, id to company dtypes: int64(1), object(8) memory usage: 70.4+ KB
df.isnull().sum()
id 0 first_name 0 last_name 0 email 0 gender 0 ip_address 0 Car 0 Colour 0 company 0 dtype: int64
df.corr() #Checking Correlation between columns
id | |
---|---|
id | 1.0 |
To view the dimensions of the dataframe, we use the .shape parameter.
df.shape # size of dataframe (rows, columns)
(1000, 9)
df.columns
Index(['id', 'first_name', 'last_name', 'email', 'gender', 'ip_address', 'Car', 'Colour', 'company'], dtype='object')
df1 = df[['gender','Car','Colour']] #creating new dataframe
df1
gender | Car | Colour | |
---|---|---|---|
0 | Female | Honda | Pink |
1 | Female | Mazda | Crimson |
2 | Female | Hyundai | Teal |
3 | Male | Land Rover | Purple |
4 | Male | Pontiac | Aquamarine |
... | ... | ... | ... |
995 | Male | Mazda | Maroon |
996 | Male | Buick | Pink |
997 | Female | Chevrolet | Crimson |
998 | Female | Volkswagen | Violet |
999 | Male | Jeep | Blue |
1000 rows × 3 columns
dummies = pd.get_dummies(df1.Car) merged = pd.concat([df1, dummies], axis='columns') merged = merged.drop(['Car'], axis='columns') merged
df1 = df1[(df1.gender == "Male") | (df1.gender == "Female")]
df1
gender | Car | Colour | |
---|---|---|---|
0 | Female | Honda | Pink |
1 | Female | Mazda | Crimson |
2 | Female | Hyundai | Teal |
3 | Male | Land Rover | Purple |
4 | Male | Pontiac | Aquamarine |
... | ... | ... | ... |
995 | Male | Mazda | Maroon |
996 | Male | Buick | Pink |
997 | Female | Chevrolet | Crimson |
998 | Female | Volkswagen | Violet |
999 | Male | Jeep | Blue |
902 rows × 3 columns
df2 = df1.groupby(['gender','Colour'],as_index=False).count() #Using groupby function to create required pivot table
df2
gender | Colour | Car | |
---|---|---|---|
0 | Female | Aquamarine | 37 |
1 | Female | Blue | 20 |
2 | Female | Crimson | 20 |
3 | Female | Fuscia | 24 |
4 | Female | Goldenrod | 24 |
5 | Female | Green | 27 |
6 | Female | Indigo | 29 |
7 | Female | Khaki | 17 |
8 | Female | Maroon | 23 |
9 | Female | Mauv | 21 |
10 | Female | Orange | 37 |
11 | Female | Pink | 25 |
12 | Female | Puce | 17 |
13 | Female | Purple | 14 |
14 | Female | Red | 24 |
15 | Female | Teal | 22 |
16 | Female | Turquoise | 26 |
17 | Female | Violet | 20 |
18 | Female | Yellow | 28 |
19 | Male | Aquamarine | 25 |
20 | Male | Blue | 25 |
21 | Male | Crimson | 23 |
22 | Male | Fuscia | 23 |
23 | Male | Goldenrod | 29 |
24 | Male | Green | 25 |
25 | Male | Indigo | 21 |
26 | Male | Khaki | 25 |
27 | Male | Maroon | 26 |
28 | Male | Mauv | 22 |
29 | Male | Orange | 25 |
30 | Male | Pink | 29 |
31 | Male | Puce | 23 |
32 | Male | Purple | 22 |
33 | Male | Red | 21 |
34 | Male | Teal | 20 |
35 | Male | Turquoise | 21 |
36 | Male | Violet | 15 |
37 | Male | Yellow | 27 |
grouped_pivot = df2.pivot(index='Colour',columns='gender',values='Car') #Creating Pivot Table
grouped_pivot.head()
gender | Female | Male |
---|---|---|
Colour | ||
Aquamarine | 37 | 25 |
Blue | 20 | 25 |
Crimson | 20 | 23 |
Fuscia | 24 | 23 |
Goldenrod | 24 | 29 |
grouped_pivot.plot(kind='bar',figsize=(15,10),color=['pink','b'])
plt.title('Color of Cars and Their Ownership by Gender',fontsize = 20)
plt.xlabel('Colour',fontsize = 15)
plt.ylabel('Gender',fontsize = 15)
plt.legend(fontsize = 10)
plt.show()
df3 = df1.groupby(['gender','Car'],as_index=False).count() #Using groupby function to create required pivot table
df3.head()
gender | Car | Colour | |
---|---|---|---|
0 | Female | Acura | 16 |
1 | Female | Aston Martin | 5 |
2 | Female | Audi | 12 |
3 | Female | BMW | 20 |
4 | Female | Bentley | 2 |
grouped_pivot2 = df3.pivot(index='Car',columns='gender',values='Colour') #Creating Pivot Table
grouped_pivot2.head()
gender | Female | Male |
---|---|---|
Car | ||
Acura | 16.0 | 7.0 |
Alfa Romeo | NaN | 2.0 |
Aston Martin | 5.0 | 1.0 |
Audi | 12.0 | 15.0 |
BMW | 20.0 | 14.0 |
grouped_pivot2.plot(kind='bar',stacked = False,figsize=(25,10))
plt.title('Car Model and Their Ownership by Gender',fontsize = 20)
plt.xlabel('Car Model',fontsize = 15)
plt.ylabel('Gender Count',fontsize = 15)
plt.legend(fontsize = 10)
plt.show()
df.columns
Index(['id', 'first_name', 'last_name', 'email', 'gender', 'ip_address', 'Car', 'Colour', 'company'], dtype='object')
df4 = df[['gender','Car','Colour','company']]
df5 = df4.apply(lambda x: x.factorize()[0]).corr()
df5
gender | Car | Colour | company | |
---|---|---|---|---|
gender | 1.000000 | -0.041169 | 0.010857 | 0.026105 |
Car | -0.041169 | 1.000000 | 0.026560 | 0.050811 |
Colour | 0.010857 | 0.026560 | 1.000000 | -0.043856 |
company | 0.026105 | 0.050811 | -0.043856 | 1.000000 |
plt.figure(figsize=(4, 10))
heatmap = sns.heatmap(df5[['company']][0:].sort_values(by='company', ascending=False), vmin=-1, vmax=1, annot=True, cmap='RdBu')
heatmap.set_title('Features and their Correlation with company Column', fontdict={'fontsize':18}, pad=16)
#plt.savefig('Male.png', dpi=300, bbox_inches='tight')
Text(0.5, 1.0, 'Features and their Correlation with company Column')